1 Introduction

2 Load the required libraries

library(readr) #to load csv data.
library(dplyr) #data manipulation
library(ggplot2)
library(plotly)
library(DataExplorer)
library(naniar)
library(broom)
library(DT)

3 Load the data

# Read the housing interactions CSV into a tibble. The first column has no
# header in the file, so readr names it X1 (see the warning below).
housing_data <- read_csv("data/challenge 1 dataset (housing).csv")
## Warning: Missing column names filled in: 'X1' [1]
## Parsed with column specification:
## cols(
##   X1 = col_double(),
##   learner_id = col_double(),
##   program_code = col_character(),
##   variation_code = col_character(),
##   message_in = col_character(),
##   message_out = col_character(),
##   created_at = col_datetime(format = ""),
##   user_response = col_character()
## )

4 IDA

Introduce the data

# One-row overview: row/column counts, column types and missingness.
housing_data %>%
  introduce()
## # A tibble: 1 x 9
##     rows columns discrete_columns continuous_colu~ all_missing_col~
##    <int>   <int>            <int>            <int>            <int>
## 1 422868       8                6                2                0
## # ... with 4 more variables: total_missing_values <int>,
## #   complete_rows <int>, total_observations <int>, memory_usage <dbl>

Plot the data introduction

# Visual counterpart of introduce().
housing_data %>%
  plot_intro()

Look at the columns that have missing values

# Number and percentage of missing values for each column.
housing_data %>%
  miss_var_summary()
## # A tibble: 8 x 3
##   variable       n_miss pct_miss
##   <chr>           <int>    <dbl>
## 1 user_response   13593   3.21  
## 2 message_in       1813   0.429 
## 3 message_out       146   0.0345
## 4 X1                  0   0     
## 5 learner_id          0   0     
## 6 program_code        0   0     
## 7 variation_code      0   0     
## 8 created_at          0   0

Plot the missing data

# Plot the missing-value profile of each column.
housing_data %>%
  plot_missing()

Each column with missing data has fewer than 5% of its values missing, and because we have a large number of observations we could simply drop the affected rows. However, I will retain these observations for the purposes of answering the questions.

Look at the internal structure

# Compact view of every column: its type and first few values.
housing_data %>%
  glimpse()
## Observations: 422,868
## Variables: 8
## $ X1             <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, ...
## $ learner_id     <dbl> 8, 8, 8, 8, 8, 11, 11, 11, 11, 11, 11, 11, 11, ...
## $ program_code   <chr> "HFH", "HFH", "PL", "PL", "PL", "HFH", "HFH", "...
## $ variation_code <chr> "HFH", "HFH", "PL", "PL", "PL", "HFH", "HFH", "...
## $ message_in     <chr> "hfh", "a", "1", "a", "a", "HFH", "A", "1", "A"...
## $ message_out    <chr> "Housing is a basic need but not everyone can a...
## $ created_at     <dttm> 2017-12-20 11:55:19, 2017-12-20 11:56:19, 2017...
## $ user_response  <chr> "a", "1", "a", "a", "ACCESS|DIGI", "A", "1", "A...

5 Questions

5.1 How many learners were in the housing trainings project?

# Each row is one interaction and a learner appears on many rows, so count
# distinct learner_id values rather than rows (nrow() gives the number of
# interactions, 422868).
n_distinct(housing_data$learner_id)
## [1] 15641

15641 Learners

5.3 Which learner had the most interactions and how many interactions did they have?

# Count interactions (rows) per learner, sort by that count in descending
# order and keep the top row. `name` is spelled out in full instead of relying
# on `n =` partially matching it.
housing_data %>%
  count(learner_id, name = "total_number_of_interactions", sort = TRUE) %>%
  head(1)
## # A tibble: 1 x 2
##   learner_id total_number_of_interactions
##        <dbl>                        <int>
## 1     648424                         1172

Learner 648424 had the most interactions, with 1172 in total.

5.4 How many learners had more than 100 interactions (>100) ?

# Count interactions (rows) per learner, then count the learners with more
# than 100. `name` is spelled out in full instead of relying on `n =`
# partially matching it.
housing_data %>%
  count(learner_id, name = "total_number_of_interactions") %>%
  filter(total_number_of_interactions > 100) %>%
  nrow()
## [1] 756

756 learners had more than 100 interactions.

5.5 How many learners had 100 or fewer interactions (<= 100)?

# Count interactions (rows) per learner, then count the learners with at most
# 100. `name` is spelled out in full instead of relying on `n =` partially
# matching it.
housing_data %>%
  count(learner_id, name = "total_number_of_interactions") %>%
  filter(total_number_of_interactions <= 100) %>%
  nrow()
## [1] 14885

14885 learners had 100 or fewer interactions.

5.6 For the learners with 100 or fewer interactions, derive summary statistics of their interaction counts and visualize the distribution of those counts (i.e. the interaction count is the number of interactions a learner had in the project).

5.6.1 Derive the data for learners that had 100 or less interactions

# Per-learner interaction counts, restricted to learners with at most 100
# interactions. `name` is spelled out in full instead of relying on `n =`
# partially matching it.
interactions_count_by_id <- housing_data %>%
  count(learner_id, name = "interactions_count") %>%
  filter(interactions_count <= 100)

5.6.2 Summary statistics

# Five-number summary plus mean of the per-learner interaction counts.
interactions_count_by_id %>%
  pull(interactions_count) %>%
  summary()
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    4.00   14.00   20.87   30.00  100.00

The mean (20.87) is greater than the median (14), which suggests that the distribution of interaction counts is positively (right) skewed.

5.6.3 Distribution of interaction counts

# Interactive 10-bin histogram of the per-learner interaction counts.
ggplotly(
  ggplot(interactions_count_by_id, aes(x = interactions_count)) +
    geom_histogram(bins = 10)
)

5.7 Which day of the week had the most interactions and which had the least interactions?

5.7.1 day of the week that had the most interactions

# Count interactions per weekday name, sort by that count in descending order
# and keep the top row.
housing_data %>%
  count(weekdays(created_at), sort = TRUE) %>%
  head(1)
## # A tibble: 1 x 2
##   `weekdays(created_at)`      n
##   <chr>                   <int>
## 1 Wednesday              145590

Wednesday with 145590 interactions.

5.7.2 day of the week that had the least interactions

# Count interactions per weekday name, sort by that count in ascending order
# and keep the top row.
housing_data %>%
  count(weekdays(created_at)) %>%
  arrange(n) %>%
  head(1)
## # A tibble: 1 x 2
##   `weekdays(created_at)`     n
##   <chr>                  <int>
## 1 Monday                 14096

Monday with 14096 interactions.